############################################################
# DigitAnalysis of WBES data
# November, 2023
############################################################

## Change directory

#### Read package
#library(devtools)
#install_github("jlederluis/digitanalysis", force = TRUE)
library(digitanalysis)
library(ggplot2)
library(foreign)

############################################################
### Load WBES Data

fp = "WBES.dta"
D <- read.dta(fp)
head(D)

## Clean Countries 
temp<- data.frame(do.call("rbind", strsplit(D$country, "(?=[A-Za-z])(?<=[0-9])|(?=[0-9])(?<=[A-Za-z])", perl=TRUE)))
colnames(temp) <- c("countryname", "year")
D<- cbind(temp, D)
D$countryname <- gsub(" ", "", D$countryname)

## Clean Emp Data

D_clean <- D[, c("countryname", "l1", "n3", "d2")]
D_clean$l1[D_clean$l1 < 0] <- 0
D_clean$n3[D_clean$n3 < 0] <- 0
D_clean$d2[D_clean$d2 < 0] <- 0

## Need complete cases i.e. no NAs
D_clean = D_clean[complete.cases(D_clean),]

# How many?
length(unique(D_clean$countryname))

# Employees
Data_l1 = process_digit_data(raw_df = D_clean, digit_columns = c("l1"))
	
# Sales
Data_d2 = process_digit_data(raw_df = D_clean, digit_columns = c("d2"))

# Past sales
Data_n3 = process_digit_data(raw_df = D_clean, digit_columns = c("n3"))
	



#######################################
# Plot toggle: turn on before plotting
plot_toggle = TRUE 


##########################################################################################
# #All digits test except first with employment
ADTall_l1 = all_digits_test(digitdata = Data_l1, data_columns = 'l1', skip_first_digit = TRUE, omit_05 = c(0,5), suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTall_l1$sample_sizes

ADTbyCountry_l1 = all_digits_test(digitdata = Data_l1, data_columns = 'l1', skip_first_digit = TRUE, omit_05 = c(0,5), break_out = "countryname", suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTbyCountry_l1$p_values
ADTbyCountry_l1$sample_sizes

# #All digits test except first with sales
ADTall_d2 = all_digits_test(digitdata = Data_d2, data_columns = 'd2', skip_first_digit = TRUE, omit_05 = c(0,5), suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTall_d2$sample_sizes

ADTbyCountry_d2 = all_digits_test(digitdata = Data_d2, data_columns = 'd2', skip_first_digit = TRUE, omit_05 = c(0,5), break_out = "countryname", suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTbyCountry_d2$p_values

#All digits test except first with sales
ADTall_n3 = all_digits_test(digitdata = Data_n3, data_columns = 'n3', skip_first_digit = TRUE, omit_05 = c(0,5), suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTall_n3$sample_sizes

ADTbyCountry_n3 = all_digits_test(digitdata = Data_n3, data_columns = 'n3', skip_first_digit = TRUE, omit_05 = c(0,5), break_out = "countryname", suppress_first_division_plots=TRUE, plot=plot_toggle)
ADTbyCountry_n3$p_values



## Clean P-vals

l1p = ADTbyCountry_l1$p_values
l1p = cbind(rownames(l1p), data.frame(l1p, row.names=NULL))
colnames(l1p) = c("countryname", "l1pval")


d2p = ADTbyCountry_d2$p_values
d2p = cbind(rownames(d2p), data.frame(d2p, row.names=NULL))
colnames(d2p) = c("countryname", "d2pval")

n3p = ADTbyCountry_n3$p_values
n3p = cbind(rownames(n3p), data.frame(n3p, row.names=NULL))
colnames(n3p) = c("countryname", "n3pval")

#fix types
l1p$l1pval <- as.numeric(l1p$l1pval)
d2p$d2pval <- as.numeric(d2p$d2pval)
n3p$n3pval <- as.numeric(n3p$n3pval)

pvals = l1p
pvals = merge(pvals, d2p, by = "countryname")
pvals = merge(pvals, n3p, by = "countryname")

#write.csv(pvals, "PVals.csv", row.names=FALSE)
#####################################################################################################  

## Start here for correlations

pvals <- read.csv("PVals.csv")

####################################################################################################  
## Merge into CPI data
CPI21 <- read.csv("../CPI/CPI_2021.csv")

## Cleaned up among CPI to match WBES 
# Laos -> LaoPDR 
# Slovakia -> SlovakRepublic
# CentralAfricanRepublic-> Centralafricanrepublic
# Cote D'Ivoire -> Côted'Ivoire
# CostaRica -> Costarica
# KyrgyzStan -> KyrgyzRepublic
# Czechia -> Czech Republic 
# CaboVerde -> CapeVerde
# SaintLucia -> StLucia
# SaintVincentandGrenadines -> StVincentandGrenadines
# SouthSudan -> Southsudan 
# DemocraticRepublicoftheCongo -> DRC 

unique(pvals$countryname)[which(! (unique(pvals$countryname) %in% CPI21$countryname))]
# Not in data: 
# Antigua and Barbuda
# Samoa
# St Kitts and Nevis
# West Bank and Gaza
# Tonga 
# Micronesia 
# Belize

sum((unique(pvals$countryname) %in% CPI21$countryname))

	# 147 

CPI21 = merge(pvals, CPI21, by = "countryname")



#bonferroni correct
CPI21$l1flag <- as.numeric(CPI21$l1pval < 0.05/(nrow(CPI21)))
CPI21$d2flag <- as.numeric(CPI21$d2pval < 0.05/(nrow(CPI21)))
CPI21$n3flag <- as.numeric(CPI21$n3pval < 0.05/(nrow(CPI21)))


CPI21$index = CPI21$l1flag + CPI21$d2flag + CPI21$n3flag

### Sample size

nrow(CPI21)

## Correlation for all of them
cor.test(CPI21$index, CPI21$Rank.2021)
# 0.273


# Correlation for each 
cor.test(CPI21$l1flag, CPI21$Rank.2021)
#.192

cor.test(CPI21$d2flag, CPI21$Rank.2021)
#.237

cor.test(CPI21$n3flag, CPI21$Rank.2021)
#.209

#################################################################
### Repeat with WGI 

## Note: names of WGI were edited to match the WBES names as follows:
# Antigua and Barbuda -> Antigua and barbuda
# Cabo Verde -> Cape Verde
# Costa Rica -> Costa rica 
# Congo, Dem. Rep. -> DRC
# Guinea-Bissau -> GuineaBissau
# Yeme, Rep. -> Yemen
# Bahamas, The -> Bahamas 
# Central African Republic -> Central african republic 
# Czechia -> Czech Republic
# Egypt, Arab Rep. -> Egypt
# Russian Federation -> Russia 
# St. Lucia -> St Lucia
# Venezuela, RB -> Venezuela
# Congo, Rep -> Congo
# Gambia, The -> Gambia
# Micronesia, Fed Sts -> Micronesia
# St. Vincent and the Grenadines -> St Vincent and Grenadines
# South Sudan -> South sudan 
# St. Kitts and Nevis -> St Kitts and Nevis
# West Bank and Gaza -> West Bank And Gaza


WGI21 <- read.csv("../WGI/WGI_Control.csv")

WGI21$countryname <- gsub(" ", "", WGI21$Country.Name)

WGI21$countryname[WGI21$countryname == "Coted'Ivoire"] = "Côted'Ivoire" 
WGI21$countryname[WGI21$countryname == "Turkiye"] = "Türkiye" 

sum(!(unique(pvals$countryname) %in% WGI21$countryname))
	# 0 good job

sum(unique(pvals$countryname) %in% WGI21$countryname)

WGI21<- merge(pvals, WGI21, by = "countryname")


#bonferroni correct
WGI21$l1flag <- as.numeric(WGI21$l1pval < 0.05/(nrow(WGI21)))
WGI21$d2flag <- as.numeric(WGI21$d2pval < 0.05/(nrow(WGI21)))
WGI21$n3flag <- as.numeric(WGI21$n3pval < 0.05/(nrow(WGI21)))


WGI21$index <- WGI21$l1flag + WGI21$d2flag + WGI21$n3flag 

# Fix type
WGI21$Control <- as.numeric(WGI21$Control21)

#### Correlation tests
cor.test(WGI21$index, WGI21$Control)
# -0.311078

cor.test(WGI21$l1flag, WGI21$Control)
cor.test(WGI21$d2flag, WGI21$Control)
cor.test(WGI21$n3flag, WGI21$Control)


library(binsreg)




ggplot(WGI21, aes(x = Control, y = index)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + theme_bw() + labs(y= "Number of Tests Failed", x = "WGI Control of Corruption")

d = binsreg(WGI21$index, WGI21$Control, polyreg = 1) 

d + labs(y= "Number of Tests Failed", x = "WGI Control of Corruption")


reg = lm(WGI21$index ~ WGI21$Control)
summary(reg)


########################
# Check/correct for rounding
roundingd2 = rounding_test(digitdata = Data_d2, data_columns = 'd2', break_out='countryname', rounding_patterns = c('0','00','000','0000', '00000', '000000', '5', '50', '500'), plot= TRUE)
roundingn3 = rounding_test(digitdata = Data_n3, data_columns = 'n3', break_out='countryname', rounding_patterns = c('0','00','000','0000', '00000', '000000', '5', '50', '500'), plot= TRUE)
roundingl1 = rounding_test(digitdata = Data_l1, data_columns = 'l1', break_out='countryname', rounding_patterns = c('0','00','000','0000', '00000', '000000', '5', '50', '500'), plot= TRUE)

roundingl1$percent_rounded
# all = 30%
roundingd2$percent_rounded
# all = 57%
roundingn3$percent_rounded
# all = 49%

rd2 = roundingd2$percent_rounded
rd2 = cbind(rownames(rd2), data.frame(rd2, row.names=NULL))
colnames(rd2) = c("countryname", "d2rounding")

rn3 = roundingn3$percent_rounded
rn3 = cbind(rownames(rn3), data.frame(rn3, row.names=NULL))
colnames(rn3) = c("countryname", "n3rounding")

rl1 = roundingl1$percent_rounded
rl1 = cbind(rownames(rl1), data.frame(rl1, row.names=NULL))
colnames(rl1) = c("countryname", "l1rounding")



WGI21 = merge(rd2, WGI21, by = "countryname")

## Does the correlation persist even without highly rounded countries 
cutoff = quantile(roundingd2$percent_rounded, 0.8)

WGI_lessround = WGI21[WGI21$d2rounding < cutoff,]

nrow(WGI_lessround)

cor.test(WGI_lessround$index, WGI_lessround$Control)

#-0.3734498 

# Correlation for each, post-rounding
cor.test(WGI_lessround$l1flag, WGI_lessround$Control)
# -0.3127324 


cor.test(WGI_lessround$d2flag, WGI_lessround$Control)
# -0.2861328 


cor.test(WGI_lessround$n3flag, WGI_lessround$Control)
# -0.2969705 

########################
### Get Plots
## Need to have run all of above 


Sample <- ADTbyCountry_d2$sample_sizes
Sample$countryname <- row.names(Sample)
colnames(Sample) <- c("SampleSizes", "countryname")

WGI21 = merge(Sample, WGI21, by = "countryname")

WGI21[order(WGI21$Control),c("countryname", "SampleSizes", "Control")]

ADTbyCountry_d2$plots$Southsudan
ADTbyCountry_d2$plots$Yemen
ADTbyCountry_d2$plots$Venezuela

ADTbyCountry_d2$plots$Sweden
ADTbyCountry_d2$plots$Finland
ADTbyCountry_d2$plots$Denmark

ADTbyCountry_d2$plots$Kenya
ADTbyCountry_d2$plots$Uganda
ADTbyCountry_d2$plots$Tanzania






########################
## Padding test isn't appropriate
## Show no correlation

#paddingl1 = padding_test(digitdata = Data_l1, break_out = "countryname", data_columns = 'l1', max_length=7, num_digits=5, N=1000, omit_05=c(0,5), simulate=T, suppress_first_division_plots=TRUE, plot=TRUE)
#paddingd2 = padding_test(digitdata = Data_d2, break_out = "countryname", data_columns = 'd2', max_length=7, num_digits=5, N=1000, omit_05=c(0,5), simulate=T, suppress_first_division_plots=TRUE, plot=TRUE)


# paddingn3 = padding_test(digitdata = Data_n3, break_out = "countryname", data_columns = 'n3', max_length=7, num_digits=5, N=1000, omit_05=c(0,5), simulate=T, suppress_first_division_plots=TRUE, plot=TRUE)


# ## Fix format 
# padding_pvals_n3=do.call(rbind.data.frame, paddingn3$p_values)
# colnames(padding_pvals_n3) = c("p10k", "p1k", "p100s", "p10s", "p1s")

# padding_pvals_n3$p10k <- as.numeric(padding_pvals_n3$p10k)
# padding_pvals_n3$p1k <- as.numeric(padding_pvals_n3$p1k)
# padding_pvals_n3$p100s <- as.numeric(padding_pvals_n3$p100s)
# padding_pvals_n3$p10s <- as.numeric(padding_pvals_n3$p10s)
# padding_pvals_n3$p1s <- as.numeric(padding_pvals_n3$p1s)


# #Comparison p
# alpha = 0.05

# padding_pvals_n3$pvaln3index = as.numeric(padding_pvals_n3$p10k < alpha) + 
# as.numeric(padding_pvals_n3$p1k < alpha)
# as.numeric(padding_pvals_n3$p100s < alpha) + 
# as.numeric(padding_pvals_n3$p10s < alpha) + 
# as.numeric(padding_pvals_n3$p1s < alpha) 

# padding_pvals_n3$countryname <- rownames(padding_pvals_n3)


# CPI21 = merge(padding_pvals_n3, CPI21, by = "countryname")

# cor.test(CPI21$pvaln3index, CPI21$Rank.2021)
